suppressPackageStartupMessages({
library(knitr)
library(tidyverse)
library(Seurat)
library(SeuratData)
library(scater)
library(zellkonverter)
library(SingleCellExperiment)
library(EnsDb.Mmusculus.v79)
library(Signac)
})
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
The dataset consists of multiome data (scATAC-seq & scRNA-seq) from cells at the four mentioned timepoints.
We can read in the .h5ad file as a SummarizedCellExperiment
rna_gastr_SE <- readH5AD("jupyter_notebooks/anndata_rna.h5ad")
atac_gastr_SE <- readH5AD("jupyter_notebooks/anndata_atac_peak_matrix.h5ad")
print(paste0("The RNA data has dimensions ", paste(dim(rna_gastr_SE), collapse = ", ")))
## [1] "The RNA data has dimensions 32285, 45991"
print(paste0("The ATAC data has dimensions ", paste(dim(atac_gastr_SE), collapse = ", ")))
## [1] "The ATAC data has dimensions 180499, 45991"
We can convert SummarizedCellExperiment to a Seurat object. The SummarizedExperiment contains the raw counts of the gene expression. Additionally we can add the metadata information to the Seurat objects.
rna_seurat <- as.Seurat(rna_gastr_SE, counts = "X", data = "X")
atac_seurat <- as.Seurat(atac_gastr_SE, counts = "X", data = "X")
# Lets add the metadata to the Seurat object
rna_seurat <- AddMetaData(rna_seurat, as.data.frame(colData(rna_gastr_SE)))
atac_seurat <- AddMetaData(atac_seurat, as.data.frame(colData(atac_gastr_SE)))
rna_seurat <- AddMetaData(rna_seurat, atac_seurat@meta.data %>% dplyr::select(BlacklistRatio:FRIP))
rna_seurat@meta.data %>% head %>% knitr::kable()
| orig.ident | nCount_originalexp | nFeature_originalexp | sample | stage | nFeature_RNA | mitochondrial_percent_RNA | ribosomal_percent_RNA | celltype | BlacklistRatio | nDiFrags | nFrags | nMonoFrags | nMultiFrags | NucleosomeRatio | PassQC | PromoterRatio | ReadsInBlacklist | ReadsInPromoter | ReadsInTSS | Sample | TSSEnrichment | barcode | nCount_RNA | pass_rnaQC | doublet_score | doublet_call | celltype.mapped_mnn | celltype.score_mnn | closest.cell | celltype.mapped_seurat | celltype.score_seurat | TSSEnrichment_atac | ReadsInTSS_atac | PromoterRatio_atac | NucleosomeRatio_atac | nFrags_atac | BlacklistRatio_atac | ReadsInPeaks | FRIP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| E7.5_rep1#AAACAGCCAGGAACTG-1 | E7.5 | 3268 | 1636 | E7.5_rep1 | E7.5 | 1636 | 22.55 | 6.55 | Primitive_Streak | 0.0245961 | 9409 | 26061 | 11164 | 5488 | 1.334378 | 1 | 0.1753770 | 1282 | 9141 | 2170 | E7.5_rep1 | 12.277 | AAACAGCCAGGAACTG-1 | 3268 | TRUE | 0.05 | FALSE | Primitive_Streak | 0.84 | cell_114243 | Epiblast | 0.95 | 12.28 | 2170 | 0.18 | 1.33 | 26061 | 0.02 | 18291 | 0.3509267 |
| E7.5_rep1#AAACAGCCATCCTGAA-1 | E7.5 | 6034 | 2677 | E7.5_rep1 | E7.5 | 2677 | 21.63 | 6.36 | Nascent_mesoderm | 0.0131758 | 8183 | 22048 | 9047 | 4818 | 1.437051 | 1 | 0.1533246 | 581 | 6761 | 1596 | E7.5_rep1 | 10.097 | AAACAGCCATCCTGAA-1 | 6034 | TRUE | 0.11 | FALSE | Nascent_mesoderm | 0.56 | cell_87889 | Nascent_mesoderm | 0.53 | 10.10 | 1596 | 0.15 | 1.44 | 22048 | 0.01 | 17007 | 0.3857862 |
| E7.5_rep1#AAACAGCCATGCTATG-1 | E7.5 | 11993 | 3985 | E7.5_rep1 | E7.5 | 3985 | 24.96 | 8.14 | Primitive_Streak | 0.0168382 | 12809 | 37415 | 16731 | 7875 | 1.236268 | 1 | 0.1664038 | 1260 | 12452 | 2770 | E7.5_rep1 | 9.157 | AAACAGCCATGCTATG-1 | 11993 | TRUE | 0.52 | FALSE | Primitive_Streak | 0.64 | cell_103930 | Rostral_neurectoderm | 0.50 | 9.16 | 2770 | 0.17 | 1.24 | 37415 | 0.02 | 27761 | 0.3711364 |
| E7.5_rep1#AAACATGCAACCTGGT-1 | E7.5 | 11572 | 3703 | E7.5_rep1 | E7.5 | 3703 | 14.47 | 12.73 | Parietal_endoderm | 0.0451070 | 6366 | 13080 | 3807 | 2907 | 2.435776 | 1 | 0.2454128 | 1180 | 6420 | 1722 | E7.5_rep1 | 17.397 | AAACATGCAACCTGGT-1 | 11572 | TRUE | 0.52 | FALSE | Parietal_endoderm | 1.00 | cell_61845 | Parietal_endoderm | 1.00 | 17.40 | 1722 | 0.25 | 2.44 | 13080 | 0.05 | 10645 | 0.4071991 |
| E7.5_rep1#AAACATGCAATGAATG-1 | E7.5 | 10924 | 4030 | E7.5_rep1 | E7.5 | 4030 | 15.93 | 5.32 | Somitic_mesoderm | 0.0154294 | 8281 | 19411 | 6073 | 5057 | 2.196279 | 1 | 0.2148524 | 599 | 8341 | 2149 | E7.5_rep1 | 16.820 | AAACATGCAATGAATG-1 | 10924 | TRUE | 0.36 | FALSE | Somitic_mesoderm | 0.84 | cell_121109 | Somitic_mesoderm | 0.87 | 16.82 | 2149 | 0.21 | 2.20 | 19411 | 0.02 | 16046 | 0.4135354 |
| E7.5_rep1#AAACATGCAATTATGC-1 | E7.5 | 7175 | 3115 | E7.5_rep1 | E7.5 | 3115 | 18.16 | 7.01 | Gut | 0.0151415 | 3495 | 9114 | 3186 | 2433 | 1.860640 | 1 | 0.1916831 | 276 | 3494 | 827 | E7.5_rep1 | 11.372 | AAACATGCAATTATGC-1 | 7175 | TRUE | 0.10 | FALSE | Gut | 0.88 | cell_48964 | Gut | 1.00 | 11.37 | 827 | 0.19 | 1.86 | 9114 | 0.02 | 7385 | 0.4054573 |
atac_seurat@meta.data %>% head %>% knitr::kable()
| orig.ident | nCount_originalexp | nFeature_originalexp | BlacklistRatio | nDiFrags | nFrags | nMonoFrags | nMultiFrags | NucleosomeRatio | PassQC | PromoterRatio | ReadsInBlacklist | ReadsInPromoter | ReadsInTSS | Sample | TSSEnrichment | barcode | sample | nFeature_RNA | nCount_RNA | mitochondrial_percent_RNA | ribosomal_percent_RNA | stage | pass_rnaQC | doublet_score | doublet_call | celltype.mapped_mnn | celltype.score_mnn | closest.cell | celltype.mapped_seurat | celltype.score_seurat | TSSEnrichment_atac | ReadsInTSS_atac | PromoterRatio_atac | NucleosomeRatio_atac | nFrags_atac | BlacklistRatio_atac | ReadsInPeaks | FRIP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| E8.5_rep1#TTACGTTTCTGGCATG-1 | E8.5 | 105065 | 46293 | 0.0175625 | 98930 | 221979 | 74921 | 48128 | 1.962841 | 1 | 0.1424189 | 7797 | 63228 | 15431 | E8.5_rep1 | 10.457 | TTACGTTTCTGGCATG-1 | E8.5_rep1 | 6842 | 23641 | 0.79 | 2.94 | E8.5 | TRUE | 0.85 | FALSE | ExE_endoderm | 0.72 | cell_90417 | Visceral_endoderm | 0.47 | 10.46 | 15431 | 0.14 | 1.96 | 221979 | 0.02 | 122008 | 0.2749303 |
| E8.5_rep1#TCCTCTAAGTCCTTCA-1 | E8.5 | 97181 | 42026 | 0.0153818 | 56865 | 150503 | 65362 | 28276 | 1.302607 | 1 | 0.1762855 | 4630 | 53063 | 13732 | E8.5_rep1 | 12.839 | TCCTCTAAGTCCTTCA-1 | E8.5_rep1 | 4936 | 12218 | 0.46 | 2.83 | E8.5 | TRUE | 0.43 | FALSE | ExE_mesoderm | 0.92 | cell_99030 | ExE_mesoderm | 0.74 | 12.84 | 13732 | 0.18 | 1.30 | 150503 | 0.02 | 110797 | 0.3681404 |
| E8.5_rep1#TATCCAGCACAGACTC-1 | E8.5 | 91323 | 37261 | 0.0186042 | 68122 | 150154 | 53887 | 28145 | 1.786461 | 1 | 0.1928220 | 5587 | 57906 | 15136 | E8.5_rep1 | 14.320 | TATCCAGCACAGACTC-1 | E8.5_rep1 | 8194 | 56000 | 17.93 | 5.12 | E8.5 | TRUE | 1.03 | FALSE | ExE_endoderm | 1.00 | cell_26080 | ExE_endoderm | 1.00 | 14.32 | 15136 | 0.19 | 1.79 | 150154 | 0.02 | 110569 | 0.3682810 |
| E8.5_rep1#GAGAACCAGACACTTA-1 | E8.5 | 98358 | 43749 | 0.0168635 | 62358 | 149079 | 57677 | 29044 | 1.584722 | 1 | 0.1802870 | 5028 | 53754 | 13554 | E8.5_rep1 | 12.929 | GAGAACCAGACACTTA-1 | E8.5_rep1 | 3434 | 8178 | 23.05 | 7.65 | E8.5 | TRUE | 0.61 | FALSE | ExE_endoderm | 1.00 | cell_82904 | ExE_endoderm | 1.00 | 12.93 | 13554 | 0.18 | 1.58 | 149079 | 0.02 | 110888 | 0.3719776 |
| E8.5_rep1#TCAAGTATCTTAGCGG-1 | E8.5 | 89709 | 38467 | 0.0168807 | 65453 | 136754 | 43825 | 27476 | 2.120456 | 1 | 0.2157962 | 4617 | 59022 | 14931 | E8.5_rep1 | 13.170 | TCAAGTATCTTAGCGG-1 | E8.5_rep1 | 4867 | 12848 | 5.68 | 7.35 | E8.5 | TRUE | 1.14 | FALSE | Erythroid1 | 0.80 | cell_77053 | Erythroid1 | 0.78 | 13.17 | 14931 | 0.22 | 2.12 | 136754 | 0.02 | 105227 | 0.3849027 |
| E8.5_rep1#CCCTTAATCTCACAAA-1 | E8.5 | 82843 | 37384 | 0.0158621 | 48689 | 122525 | 49548 | 24288 | 1.472855 | 1 | 0.1740788 | 3887 | 42658 | 10573 | E8.5_rep1 | 11.612 | CCCTTAATCTCACAAA-1 | E8.5_rep1 | 5022 | 12406 | 3.83 | 4.32 | E8.5 | TRUE | 1.00 | FALSE | Mesenchyme | 0.96 | cell_131042 | Mesenchyme | 0.55 | 11.61 | 10573 | 0.17 | 1.47 | 122525 | 0.02 | 91139 | 0.3719868 |
tibble(sample = rna_seurat@meta.data$sample, umi_per_cell = Matrix::colSums(rna_seurat@assays$originalexp @counts)) %>%
arrange(sample, desc(umi_per_cell)) %>%
group_by(sample) %>%
mutate(idx = seq_along(sample)) %>%
mutate(cum_umi_per_cell = cumsum(umi_per_cell)) %>%
ggplot() +
geom_line(aes(x = idx, y = cum_umi_per_cell)) +
facet_wrap(~sample, scales = "fixed") +
scale_y_log10() + scale_x_log10() +
ylab("cumulative UMI count") +
xlab("index")
From the plots below it seems that all cells with percentage of mitochondrial genes above 40% were removed. The early time points E7.5rep1/2 have the highest percentage of mitochondrial genes. Especially E7.5 rep2 seems to have a higher percentage of mitochondrial genes. Conversely, the same replicates from E7.5 have a lower number of features and counts. This means that the E7.5 rep2 and to a lesser extent E7.5 rep1 are of lower quality compared to the other samples. E8.75 rep1/rep2 seem to have the highest quality.
#rename metadata
variables <- c("nFeature_originalexp", "nCount_originalexp", "mitochondrial_percent_RNA", "ribosomal_percent_RNA")
plots <- map(variables, function(n){
df <- rna_seurat@meta.data
ggplot(df) +
geom_boxplot(aes(x = df %>% pull("sample"), y = df %>% pull(n),
fill = df %>% pull("sample")), alpha = .1) +
geom_violin(aes(x = df %>% pull("sample"), y = df %>% pull(n),
fill = df %>% pull("sample")), alpha = .5) +
xlab("sample") +
ylab(paste0(n)) +
labs(title = paste0(n)) +
guides(fill=guide_legend(title="sample"))
})
gridExtra::grid.arrange(grobs = plots, ncol = 2)
There are two different methods for mapping listed in the metadata:
atac_seurat@meta.data %>% group_by(celltype.mapped_seurat) %>%
summarise(n = n()) %>% knitr::kable(caption = "Number of celltypes using Seurat")
| celltype.mapped_seurat | n |
|---|---|
| Allantois | 735 |
| Anterior_Primitive_Streak | 86 |
| Blood_progenitors_1 | 148 |
| Blood_progenitors_2 | 526 |
| Cardiomyocytes | 895 |
| Caudal_Mesoderm | 240 |
| Caudal_epiblast | 643 |
| Caudal_neurectoderm | 54 |
| Def._endoderm | 301 |
| Endothelium | 651 |
| Epiblast | 976 |
| Erythroid1 | 761 |
| Erythroid2 | 508 |
| Erythroid3 | 1096 |
| ExE_ectoderm | 3337 |
| ExE_endoderm | 6000 |
| ExE_mesoderm | 1434 |
| Forebrain_Midbrain_Hindbrain | 3588 |
| Gut | 2126 |
| Haematoendothelial_progenitors | 998 |
| Intermediate_mesoderm | 942 |
| Mesenchyme | 3201 |
| Mixed_mesoderm | 425 |
| NMP | 1699 |
| Nascent_mesoderm | 946 |
| Neural_crest | 529 |
| Notochord | 169 |
| PGC | 164 |
| Paraxial_mesoderm | 2161 |
| Parietal_endoderm | 528 |
| Pharyngeal_mesoderm | 1655 |
| Primitive_Streak | 316 |
| Rostral_neurectoderm | 1565 |
| Somitic_mesoderm | 1618 |
| Spinal_cord | 1527 |
| Surface_ectoderm | 2804 |
| Visceral_endoderm | 639 |
atac_seurat@meta.data %>% group_by(celltype.mapped_mnn) %>%
summarise(n = n()) %>% knitr::kable(caption = "Number of celltypes using MNN")
| celltype.mapped_mnn | n |
|---|---|
| Allantois | 661 |
| Anterior_Primitive_Streak | 83 |
| Blood_progenitors_1 | 258 |
| Blood_progenitors_2 | 596 |
| Cardiomyocytes | 874 |
| Caudal_Mesoderm | 143 |
| Caudal_epiblast | 566 |
| Caudal_neurectoderm | 24 |
| Def._endoderm | 503 |
| Endothelium | 649 |
| Epiblast | 614 |
| Erythroid1 | 1196 |
| Erythroid2 | 887 |
| Erythroid3 | 165 |
| ExE_ectoderm | 3360 |
| ExE_endoderm | 6226 |
| ExE_mesoderm | 1199 |
| Forebrain_Midbrain_Hindbrain | 3514 |
| Gut | 1895 |
| Haematoendothelial_progenitors | 992 |
| Intermediate_mesoderm | 840 |
| Mesenchyme | 3209 |
| Mixed_mesoderm | 230 |
| NMP | 1636 |
| Nascent_mesoderm | 1094 |
| Neural_crest | 526 |
| Notochord | 142 |
| PGC | 40 |
| Paraxial_mesoderm | 2398 |
| Parietal_endoderm | 541 |
| Pharyngeal_mesoderm | 1842 |
| Primitive_Streak | 333 |
| Rostral_neurectoderm | 1965 |
| Somitic_mesoderm | 1708 |
| Spinal_cord | 1673 |
| Surface_ectoderm | 2784 |
| Visceral_endoderm | 625 |
The annotations are similar across the different samples. However, there are some differences in the confidence with which cells can me mapped for different celltypes. For example, cardiomyocytes, erythroids and extraembryonic ectoderm and endoderm (but not mesoderm), mesenchyme and parietal endoderm can be mapped with very high confidence.
atac_seurat@meta.data %>%
ggplot() +
geom_boxplot(aes(x = celltype.mapped_seurat, y = celltype.score_seurat, fill = celltype.mapped_seurat)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.8, hjust=1)) + NoLegend()
atac_seurat@meta.data %>%
ggplot() +
geom_boxplot(aes(x = celltype.mapped_mnn, y = celltype.score_mnn, fill = celltype.mapped_mnn)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.8, hjust=1)) + NoLegend()
rna_seurat <- rna_seurat %>%
NormalizeData(verbose = FALSE) %>%
ScaleData(verbose = FALSE) %>%
FindVariableFeatures(verbose = FALSE)
#gastr_seurat@assays$RNA@data[0:10, 0:10]
I will proceed with 15 PCs.
rna_seurat <- RunPCA(rna_seurat, features = VariableFeatures(rna_seurat),
verbose = FALSE)
ElbowPlot(rna_seurat)
pca_plots <- comprehenr::to_list(for (i in 1:15)
DimPlot(rna_seurat, reduction = "pca", dims = i:(i+1)) +
theme())
#pca_plots
#gridExtra::grid.arrange(unlist(pca_plots), ncol = 3, nrow = 5)
Trying a different resolution for the clustering to see if the Mesenchyme will still separate.
rna_seurat <- FindNeighbors(rna_seurat, verbose = FALSE)
rna_seurat <- FindClusters(rna_seurat, verbose = FALSE, resolution = 0.6)
rna_seurat <- RunUMAP(rna_seurat, verbose = FALSE, dims = 1:15)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
colPalette_celltypes = c('#532C8A',
'#c19f70',
'#f9decf',
'#c9a997',
'#B51D8D',
'#3F84AA',
'#9e6762',
'#354E23',
'#F397C0',
'#ff891c',
'#635547',
'#C72228',
'#f79083',
'#EF4E22',
'#989898',
'#7F6874',
'#8870ad',
'#647a4f',
'#EF5A9D',
'#FBBE92',
'#139992',
'#cc7818',
'#DFCDE4',
'#8EC792',
'#C594BF',
'#C3C388',
'#0F4A9C',
'#FACB12',
'#8DB5CE',
'#1A1A1A',
'#C9EBFB',
'#DABE99',
'#65A83E',
'#005579',
'#CDE088',
'#f7f79e',
'#F6BFCB')
There are two possibilities to proceed with celltype annotations:
In the plot below you can see that the Mesenchyme separates into two clusters, even though the cells belong to the same celltype. Therefore I visualized the Mesenchyme cells according to their respective time point and replicate below. It becomes evident that the Mesenchyme separate into E7.5 and E8.0, E8.5, E8.75. Therefore, the separation is probably a biological effect corresponding to different signatures at different time points. However, we should keep in mind that E7.5 was also the time point with lowest quality of cells.
There is one cluster which contains a very heterogeneous population of cells, namely mixed & nascent mesoderm, caudal & rostral neuroectoderm, primitive streak, caudal epiblast and epiblast. As we will see later on, these are primarily early celltypes found at E7.5, which are not present at later timepoints anymore.
celltypes <- (atac_seurat@meta.data %>% group_by(celltype.mapped_seurat) %>%
summarise(n = n()))$celltype.mapped_seurat
col <- setNames(colPalette_celltypes, celltypes)
DimPlot(rna_seurat, reduction = "umap", pt.size = 1,
group.by = "celltype.mapped_seurat", label = TRUE, repel = TRUE, cols = col) +
NoLegend()
DimPlot(subset(rna_seurat, celltype.mapped_seurat == "Mesenchyme"),
group.by = "sample")
DimPlot(subset(rna_seurat, celltype.mapped_seurat == "Mesenchyme"),
split.by = "sample")
Plot with color legend.
DimPlot(rna_seurat, reduction = "umap", pt.size = 1,
group.by = "celltype.mapped_seurat", cols = col) #, label = TRUE, repel = TRUE) +
Clustering by Seurat with resolution = 0.6 yields 17 distinct clusters, not enough to differentiate the large number of different celltypes in this dataset.
DimPlot(rna_seurat, reduction = "umap", pt.size = .1,
group.by = "seurat_clusters", label = TRUE) +
NoLegend()
Below, you can see how the earlier time points separate from the later time points. The number of counts is very homogenous, while the mitochondrial RNA percentage is higher and the number of features lower for E7.5 as already pointed out.
p1 <- DimPlot(rna_seurat, reduction = "umap", pt.size = .1, group.by = "sample")
p2 <- FeaturePlot(rna_seurat, reduction = "umap", pt.size = .1,
features = "nCount_RNA") +
scale_color_viridis_c()
p3 <- FeaturePlot(rna_seurat, reduction = "umap", pt.size = .1, features = "mitochondrial_percent_RNA") +
scale_color_viridis_c()
p4 <- FeaturePlot(rna_seurat, reduction = "umap", pt.size = .1, features = "nFeature_RNA") +
scale_color_viridis_c()
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
Visualizing the cells at each timepoint in separate UMAPs shows how the distribution of celltypes changes as gastrulation proceeds. The heterogenous cell cluster present at E7.5 disappears at later time points. We can also see how no Erythroids are present at E7.5, Erythroid1 (and to lesser extend Erythroid2 and 3) appear at E8.0, while Eryhtroid3 are found to a higher extent at E8.75, which indicates that these cells correspond to different developmental stages.
DimPlot(rna_seurat, reduction = "umap", pt.size = 1,
group.by = "celltype.mapped_seurat", split.by = "orig.ident", ncol = 1, cols = col) +
labs(title = "Celltypes at different time points")
We can see that nascent mesoderm, epiblast and primitive streak are only present at E75. Also, extraembryonice endoderm and ectoderm are highest and decrease with progressing gastrulation. Forebrain/Midbrain/Hindbrain, neural crest and neuromesodermal progenitor (NMP) cells are not present at E7.5, but emerge at E8.0 and are present at even higher percentage at E8.5. Also, the endothelium appears only at E8.0. The same is true for Allantois, erythroids (produce red blood cells, remain in bone marrow)and cardiomyocytes. Conversely, the number of extraembryonic ectoderm cells and extraembryonic endoderm cells become less at each time step.
(Pijuan_Sala et.al, A single-cell molecular map of mouse gastrulation and early organogenesis, 2019, Nature)
# plot the frequency of each cell type at each embryonic stage
# all frequencies for one embryonic stage would add up to 0
rna_seurat@meta.data %>%
group_by(orig.ident, celltype.mapped_seurat) %>%
summarise(Total = n()) %>%
mutate(freq = Total/sum(Total)) %>%
ggplot(aes(x = celltype.mapped_seurat, y = freq, fill = orig.ident)) +
geom_bar(position = "dodge", stat = "identity") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.8, hjust=1))
When doing the preprocessing steps and dimensionality reduction for each time point separately, we can create more clearly separated clusters for the later time points, however, for E7.5 the mixed cluster of epiblast, mixed & nascent mesoderm, primitive streak and PGC remains a heterogeneous cluster.
stages <- c("E7.5", "E8.0", "E8.5", "E8.75")
seurat_objects <- map(stages, function(n){
rna_seurat <- subset(rna_seurat, subset = orig.ident == n)
rna_seurat <- rna_seurat %>%
NormalizeData(verbose = FALSE) %>%
ScaleData(verbose = FALSE) %>%
FindVariableFeatures(verbose = FALSE)
rna_seurat <- rna_seurat %>% RunPCA(features = VariableFeatures(rna_seurat), verbose = FALSE) %>%
FindNeighbors(verbose = FALSE) %>%
FindClusters(verbose = FALSE, resolution = .9) %>%
RunUMAP(verbose = TRUE, dims = 1:15)
list(name = n, object = rna_seurat)
})
e75 <- seurat_objects[[1]]$object
e80 <- seurat_objects[[2]]$object
e85 <- seurat_objects[[3]]$object
e785 <- seurat_objects[[4]]$object
plots <- map(seq.int(1,4), function(n){
object <- seurat_objects[[n]]$object
p1 <- DimPlot(object, reduction = "umap", group.by = "celltype.mapped_seurat",
cols = col, size = .9) +
labs(title = paste0(seurat_objects[[n]]$name, " celltypes"))
list(plot = p1)
})
gridExtra::grid.arrange(plots[[1]]$p, plots[[2]]$p, plots[[3]]$p,plots[[4]]$p, ncol = 1)
atac_seurat@meta.data %>%
ggplot() +
geom_density2d_filled(aes(x=log10(nFrags_atac ), y=TSSEnrichment_atac), bins=20) +
geom_hline(yintercept = 5, color="green", linetype="dashed") +
geom_vline(xintercept = 3, color="green", linetype="dashed") +
#geom_xsidedensity(aes(x=log10(pre_filter_meta$nFrags))) +
#geom_ysidedensity(aes(y = pre_filter_meta$TSSEnrichment)) +
facet_wrap(~sample) +
theme(legend.position = "none") +
labs(x = "Log10 Unique Fragments", y = "TSS Enrichment Score")
The scATAC-seq shows that E7.5 seems to be of lower quality, which we have already observed in the scRNA-seq. Potentially, the cells where simply in a worse condition when sequenced.
p1 <- atac_seurat@meta.data %>%
ggplot() +
ggridges::geom_density_ridges(aes(x = TSSEnrichment, y = Sample, fill = Sample),
alpha = .6)
p2 <- atac_seurat@meta.data %>%
ggplot() +
geom_violin(aes(x = Sample, y = TSSEnrichment, fill = Sample), alpha = 0.6) +
geom_boxplot(aes(x = Sample, y = TSSEnrichment,fill = Sample), alpha = 0.1) +
theme(legend.position = "none") +
labs(title = "TSS Enrichment")
cowplot::plot_grid(p2, p1, ncol = 2)
p1 <- atac_seurat@meta.data %>%
ggplot() +
ggridges::geom_density_ridges(aes(x = nFrags_atac , y = Sample, fill = Sample),
alpha = .6)
p2 <- atac_seurat@meta.data %>%
ggplot() +
geom_violin(aes(x = Sample, y = nFrags_atac , fill = Sample), alpha = 0.6) +
geom_boxplot(aes(x = Sample, y = nFrags_atac ,fill = Sample), alpha = 0.1) +
theme(legend.position = "none") +
labs(title = "TSS Enrichment")
cowplot::plot_grid(p2, p1, ncol = 2)
Latent Semantic Indexing
atac_seurat <- RunTFIDF(atac_seurat)
atac_seurat <- FindTopFeatures(atac_seurat)
atac_seurat <- RunSVD(atac_seurat)
The first LSI component often captures sequencing depth. We will therefore remove it from downstream analysis. The correlation between sequencing depth and each LSI component is shown in the plot below.
DepthCor(atac_seurat)
atac_seurat <- RunUMAP(atac_seurat, reduction = "lsi", dims = 2:30, verbose = FALSE)
aatac_seurat <- FindNeighbors(atac_seurat, reduction = "lsi", dims = 2:30, verbose = FALSE)
# for Clsutering instead of Louvian SLM algorithm is used
#atac_seurat <- FindClusters(atac_seurat, verbose = FALSE, algorithm = 3)
DimPlot(atac_seurat, group.by = "celltype.mapped_seurat", reduction = "umap", pt.size = 1, cols = col) +
labs(title = "scATAC-seq Celltype")
DimPlot(atac_seurat, group.by = "sample", reduction = "umap", pt.size = 1, cols = col) +
labs(title = "scATAC-seq Celltype")
DimPlot(rna_seurat , group.by = "celltype.mapped_seurat", pt.size = 1, cols = col,
reduction = "umap") +
labs(title = "scRNA-seq Celltype")
Lets have a look at whether the number of fragments is correlated to the number of counts in each celltype. This seems to be the case.
atac_seurat@meta.data %>%
ggplot(aes(x = log10(nCount_originalexp ), y = log10(nFrags_atac))) +
geom_point(alpha = .2, size = .2) +
ggside::geom_xsidedensity() +
ggside::geom_ysidedensity() +
facet_wrap(~sample) +
labs(x = "Log10 Counts", y = "log10 Unique Fragments")
atac_seurat@meta.data %>%
ggplot(aes(x = log10(nCount_originalexp), y = log10(TSSEnrichment))) +
geom_point(size = .2, alpha = .2) +
ggside::geom_xsidedensity() +
ggside::geom_ysidedensity() +
facet_wrap(~sample)
atac_seurat@meta.data %>%
ggplot() +
geom_histogram(aes(x = PromoterRatio_atac))
atac_seurat@meta.data %>%
ggplot() +
geom_histogram(aes(x = NucleosomeRatio_atac))
saveRDS(atac_seurat, "atac_Seurat_object")
saveRDS(rna_seurat, "rna_Seurat_object")